In [6]:
# Import necessary libraries
#Nisha tyagi
!pip install wordcloud Wordcloud
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveRegressor
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
Requirement already satisfied: wordcloud in c:\users\91935\anaconda3\lib\site-packages (1.9.2)
Requirement already satisfied: numpy>=1.6.1 in c:\users\91935\anaconda3\lib\site-packages (from wordcloud) (1.24.3)
Requirement already satisfied: pillow in c:\users\91935\anaconda3\lib\site-packages (from wordcloud) (9.4.0)
Requirement already satisfied: matplotlib in c:\users\91935\anaconda3\lib\site-packages (from wordcloud) (3.7.1)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.0.5)
Requirement already satisfied: cycler>=0.10 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (23.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\91935\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
In [7]:
# Load the data

data = pd.read_csv('instagram_data.csv', encoding='ISO-8859-1')
df = pd.DataFrame(data)
df
Out[7]:
Impressions From Home From Hashtags From Explore From Other Saves Comments Shares Likes Profile Visits Follows Caption Hashtags
0 3920 2586 1028 619 56 98 9 5 162 35 2 Here are some of the most important data visua... #finance #money #business #investing #investme...
1 5394 2727 1838 1174 78 194 7 14 224 48 10 Here are some of the best data science project... #healthcare #health #covid #data #datascience ...
2 4021 2085 1188 0 533 41 11 1 131 62 12 Learn how to train a machine learning model an... #data #datascience #dataanalysis #dataanalytic...
3 4528 2700 621 932 73 172 10 7 213 23 8 Here’s how you can write a Python program to d... #python #pythonprogramming #pythonprojects #py...
4 2518 1704 255 279 37 96 5 4 123 8 0 Plotting annotations while visualizing your da... #datavisualization #datascience #data #dataana...
... ... ... ... ... ... ... ... ... ... ... ... ... ...
114 13700 5185 3041 5352 77 573 2 38 373 73 80 Here are some of the best data science certifi... #datascience #datasciencejobs #datasciencetrai...
115 5731 1923 1368 2266 65 135 4 1 148 20 18 Clustering is a machine learning technique use... #machinelearning #machinelearningalgorithms #d...
116 4139 1133 1538 1367 33 36 0 1 92 34 10 Clustering music genres is a task of grouping ... #machinelearning #machinelearningalgorithms #d...
117 32695 11815 3147 17414 170 1095 2 75 549 148 214 Here are some of the best data science certifi... #datascience #datasciencejobs #datasciencetrai...
118 36919 13473 4176 16444 2547 653 5 26 443 611 228 175 Python Projects with Source Code solved an... #python #pythonprogramming #pythonprojects #py...

119 rows × 13 columns

In [8]:
# Check for missing values
data.isnull().sum()
Out[8]:
Impressions       0
From Home         0
From Hashtags     0
From Explore      0
From Other        0
Saves             0
Comments          0
Shares            0
Likes             0
Profile Visits    0
Follows           0
Caption           0
Hashtags          0
dtype: int64
In [9]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Impressions     119 non-null    int64 
 1   From Home       119 non-null    int64 
 2   From Hashtags   119 non-null    int64 
 3   From Explore    119 non-null    int64 
 4   From Other      119 non-null    int64 
 5   Saves           119 non-null    int64 
 6   Comments        119 non-null    int64 
 7   Shares          119 non-null    int64 
 8   Likes           119 non-null    int64 
 9   Profile Visits  119 non-null    int64 
 10  Follows         119 non-null    int64 
 11  Caption         119 non-null    object
 12  Hashtags        119 non-null    object
dtypes: int64(11), object(2)
memory usage: 12.2+ KB
In [10]:
# Plot distribution of Impressions from different sources
plt.figure(figsize=(10, 8))
plt.style.use('fivethirtyeight')
plt.title("Distribution of Impressions From Home")
sns.distplot(data['From Home'])
plt.show()
C:\Users\91935\AppData\Local\Temp\ipykernel_2804\2973166774.py:5: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['From Home'])
In [11]:
plt.figure(figsize=(10, 8))
plt.title("Distribution of Impressions From Hashtags")
sns.distplot(data['From Hashtags'])
plt.show()
C:\Users\91935\AppData\Local\Temp\ipykernel_2804\3618955972.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['From Hashtags'])
In [12]:
plt.figure(figsize=(10, 8))
plt.title("Distribution of Impressions From Explore")
sns.distplot(data['From Explore'])
plt.show()
C:\Users\91935\AppData\Local\Temp\ipykernel_2804\3391790389.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data['From Explore'])
In [13]:
# Plot pie chart for Impressions from different sources using Plotly
home = data["From Home"].sum()
hashtags = data["From Hashtags"].sum()
explore = data["From Explore"].sum()
other = data["From Other"].sum()

labels = ['From Home', 'From Hashtags', 'From Explore', 'Other']
values = [home, hashtags, explore, other]

fig = px.pie(data, values=values, names=labels, title="Impressions on Instagram Posts From Various Sources",hole=0.5)
fig.show()
In [14]:
# Generate and plot word cloud for captions and hashtags
text = " ".join(i for i in data.Caption)
stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
In [15]:
text_hashtags = " ".join(i for i in data.Hashtags)
stopwords_hashtags = set(STOPWORDS)
wordcloud_hashtags = WordCloud(stopwords=stopwords_hashtags, background_color="white").generate(text_hashtags)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud_hashtags, interpolation="bilinear")
plt.axis("off")
plt.show()
In [16]:
# Scatter plots for relationships between variables using Plotly Express
fig = px.scatter(data_frame=data, x="Impressions", y="Likes", size="Likes", trendline="ols", title="Relationship Between Likes and Total Impressions")
fig.show()
In [17]:
fig = px.scatter(data_frame=data, x="Impressions", y="Comments", size="Comments", trendline="ols", title="Relationship Between Comments and Total Impressions")
fig.show()
In [18]:
fig = px.scatter(data_frame=data, x="Impressions", y="Shares", size="Shares", trendline="ols", title="Relationship Between Shares and Total Impressions")
fig.show()
In [19]:
fig = px.scatter(data_frame=data, x="Impressions", y="Saves", size="Saves", trendline="ols", title="Relationship Between Post Saves and Total Impressions")
fig.show()
In [20]:
# Calculate and print correlation values for different variables
correlation = data.corr()
print(correlation["Impressions"].sort_values(ascending=False))
Impressions       1.000000
From Explore      0.893607
Follows           0.889363
Likes             0.849835
From Home         0.844698
Saves             0.779231
Profile Visits    0.760981
Shares            0.634675
From Other        0.592960
From Hashtags     0.560760
Comments         -0.028524
Name: Impressions, dtype: float64
C:\Users\91935\AppData\Local\Temp\ipykernel_2804\381329730.py:2: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

In [21]:
# Calculate and print conversion rate
conversion_rate = (data["Follows"].sum() / data["Profile Visits"].sum()) * 100
print(conversion_rate)
41.00265604249668
In [22]:
# Scatter plot for relationship between Profile Visits and Followers Gained
fig = px.scatter(data_frame=data, x="Profile Visits", y="Follows", size="Follows", trendline="ols", title="Relationship Between Profile Visits and Followers Gained")
fig.show()
In [23]:
# Train a PassiveAggressiveRegressor model
x = np.array(data[['Likes', 'Saves', 'Comments', 'Shares', 'Profile Visits', 'Follows']])
y = np.array(data["Impressions"])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

model = PassiveAggressiveRegressor()
model.fit(xtrain, ytrain)
score = model.score(xtest, ytest)
print(score)
0.8544639282017199
In [24]:
# Predict with the trained model
features = np.array([[282.0, 233.0, 4.0, 9.0, 165.0, 54.0]])
prediction = model.predict(features)
print(prediction)
[10320.43240352]
In [ ]:
 
In [ ]: